Load required packages.
library( tidyverse )
library( dplyr )
library( gridExtra )
library( mapview )
library( leaflet )
library( leaflet.extras )
Set up the workspace: remove all existing objects from the global environment, then load the data from the CSV file.
# Empty the global environment, then read the cleaned listings table that
# every plot below works from.
rm(list = ls())
df <- read.csv(file = "./data/airbnb_clean.csv")
# Number of listings per room type.
# count() returns one row per room_type ordered by that key, so sorting the
# full table beforehand has no effect on the result and is omitted.
room_type_counts <- df %>%
  count(room_type)
room_type_counts
## room_type n
## 1 Entire home/apt 25393
## 2 Private room 22306
## 3 Shared room 1159
# Pie chart of the room type counts: one stacked bar (x = "") wrapped into a
# circle by mapping y onto the polar angle.
ggplot(
  data = room_type_counts,
  mapping = aes(x = "", y = n, fill = room_type)
) +
  geom_col(width = 1) +
  coord_polar(theta = "y", start = 0) +
  theme_void()
# Disabled styling experiments, kept for reference:
# theme(legend.position = "none") +
# geom_text(aes(y = ypos, label = room_type), color = "white", size = 6) +
# scale_fill_brewer(palette = "Set1")
# Scatter plot: price against last_review_age.
df %>%
  ggplot(mapping = aes(x = last_review_age, y = price)) +
  geom_point()
## Warning: Removed 10037 rows containing missing values (`geom_point()`).

# Scatter plot: price against availability_365.
df %>%
  ggplot(mapping = aes(x = availability_365, y = price)) +
  geom_point()
# Box plot of log_price for each room type.
df %>%
  ggplot(mapping = aes(x = room_type, y = log_price)) +
  geom_boxplot()
## Warning: Removed 11 rows containing non-finite values (`stat_boxplot()`).

# The same comparison drawn as violins.
ggplot(data = df, mapping = aes(x = room_type, y = log_price)) +
  geom_violin()
## Warning: Removed 11 rows containing non-finite values (`stat_ydensity()`).

# Violin plot of the log of last_review_age per room type. Rows without a
# last_review_age are dropped first; the log is taken of the integer value.
df %>%
  filter(!is.na(last_review_age)) %>%
  mutate(
    log_last_review_age = log(as.integer(last_review_age))
  ) %>%
  ggplot(mapping = aes(x = room_type, y = log_last_review_age)) +
  geom_violin()
## Warning: Removed 88 rows containing non-finite values (`stat_ydensity()`).
# Number of listings per neighbourhood group, shown as a pie chart (a single
# stacked bar wrapped into a circle).
neighbourhood_group_counts <- count(df, neighbourhood_group)

ggplot(
  data = neighbourhood_group_counts,
  mapping = aes(x = "", y = n, fill = neighbourhood_group)
) +
  geom_col(width = 1) +
  coord_polar(theta = "y", start = 0) +
  theme_void()
# Price against distance_from_center, coloured by neighbourhood group.
df %>%
  ggplot(
    mapping = aes(
      x = distance_from_center,
      y = price,
      color = neighbourhood_group
    )
  ) +
  geom_point() +
  theme_minimal()

# Box plot of log_price for each neighbourhood group.
df %>%
  ggplot(mapping = aes(x = neighbourhood_group, y = log_price)) +
  geom_boxplot()
## Warning: Removed 11 rows containing non-finite values (`stat_boxplot()`).
#df %>%
# filter(log_price >= 0) %>%
# ggplot(mapping = aes(x = log_price, y = neighbourhood_group)) +
# geom_density_ridges(alpha = 0.5)
#df %>%
# ggplot(mapping = aes(x = distance_from_center, y = neighbourhood_group)) +
# geom_density_ridges(alpha = 0.5)
# Box plot of distance_from_center for each room type.
ggplot(data = df, mapping = aes(x = room_type, y = distance_from_center)) +
  geom_boxplot()

# All listings plotted by coordinate, coloured by room type.
ggplot(
  data = df,
  mapping = aes(x = longitude, y = latitude, color = room_type)
) +
  geom_point()
# One coordinate scatter plot per room type, titled with the room type. Each
# room type is printed as it is processed.
plots <- lapply(
  X = unique(df$room_type),
  FUN = function(r_type) {
    print(r_type)
    listings_of_type <- filter(df, room_type == r_type)
    ggplot(listings_of_type, mapping = aes(x = longitude, y = latitude)) +
      geom_point() +
      ggtitle(r_type)
  }
)
## [1] "Private room"
## [1] "Entire home/apt"
## [1] "Shared room"

# Lay the per-room-type plots out side by side in three columns.
grid.arrange(grobs = plots, ncol = 3)
# Stacked bar chart of summed listing prices per neighbourhood, split by
# room type, for one neighbourhood group.
#
# Reads the file-level data frame `df`.
#
# @param group Value of `neighbourhood_group` to restrict the listings to.
# @return A ggplot object. Neighbourhoods are ordered on the x axis by their
#   total summed price, highest first.
get_most_expensive_neighbourhoods_in_group <- function(group) {
  group_listings <- df %>%
    filter(neighbourhood_group == group)

  # Neighbourhood names ranked by total summed price, highest first.
  neighbourhood_order <- group_listings %>%
    group_by(neighbourhood) %>%
    summarise(sum_price = sum(price), .groups = "drop") %>%
    arrange(desc(sum_price)) %>%
    pull(neighbourhood)

  # Summed price per (neighbourhood, room type), in ranking order.
  # NOTE(review): head(20) keeps the first 20 rows, not the first 20
  # neighbourhoods, so the last neighbourhood shown may be missing some of
  # its room types -- confirm this is intended.
  price_by_room_type <- group_listings %>%
    group_by(neighbourhood, room_type) %>%
    summarise(sum_price = sum(price), .groups = "drop") %>%
    arrange(match(neighbourhood, neighbourhood_order)) %>%
    mutate(
      neighbourhood = factor(neighbourhood, levels = neighbourhood_order)
    ) %>%
    head(20)

  ggplot(
    data = price_by_room_type,
    mapping = aes(x = neighbourhood, y = sum_price, fill = room_type)
  ) +
    geom_col() +
    labs(
      title = paste("Most expensive neighbourhoods in", group),
      x = "Neighbourhood",
      y = "Price in $"
    )
}
# One chart per neighbourhood group. Each group is plotted exactly once (the
# chart for "Queens" used to be drawn a second time at the end).
get_most_expensive_neighbourhoods_in_group("Queens")
get_most_expensive_neighbourhoods_in_group("Brooklyn")
get_most_expensive_neighbourhoods_in_group("Manhattan")
get_most_expensive_neighbourhoods_in_group("Staten Island")
get_most_expensive_neighbourhoods_in_group("Bronx")
# Interactive leaflet heatmap of listings with a few fixed landmark markers.
#
# @param df Data frame with numeric `longitude` and `latitude` columns.
# @param zoom Zoom level passed to setView().
# @param ratio Fraction of nrow(df) used as the heatmap's `max` value.
# @return A leaflet map widget centred on the mean coordinate of `df`.
plot_map <- function(df, zoom, ratio) {
  # Plain column means give a numeric scalar for data frames and tibbles
  # alike (indexing a summarised tibble with [1, 1] yields a 1x1 tibble).
  center_lng <- mean(df[["longitude"]])
  center_lat <- mean(df[["latitude"]])
  map_max <- nrow(df) * ratio
  label_opt <- labelOptions(noHide = TRUE, textsize = "10px")

  # NOTE(review): only the Statue of Liberty marker receives label_opt; the
  # other landmarks use the default label options -- confirm this is intended.
  df %>%
    leaflet() %>%
    addTiles() %>%
    addProviderTiles(providers$OpenStreetMap.DE) %>%
    setView(center_lng, center_lat, zoom) %>%
    addHeatmap(
      lng = ~longitude,
      lat = ~latitude,
      max = map_max,
      radius = 20,
      blur = 10
    ) %>%
    addCircleMarkers(
      lat = 40.6897,
      lng = -74.0445,
      label = "Statue of Liberty",
      labelOptions = label_opt
    ) %>%
    addCircleMarkers(
      lat = 40.7484, lng = -73.9856, label = "Empire State Building"
    ) %>%
    addCircleMarkers(lat = 40.7826, lng = -73.9655, label = "Central Park") %>%
    addCircleMarkers(lat = 40.7579, lng = -73.9855, label = "Times Square") %>%
    addCircleMarkers(lat = 40.7061, lng = -73.9967, label = "Brooklyn Bridge")
}
# Heatmap of every listing.
plot_map(df, zoom = 10, ratio = 0.05)

# One heatmap per neighbourhood group, each with its own zoom and ratio.
manhattan <- filter(df, neighbourhood_group == "Manhattan")
plot_map(manhattan, zoom = 11.5, ratio = 0.02)

bronx <- filter(df, neighbourhood_group == "Bronx")
plot_map(bronx, zoom = 11.5, ratio = 0.01)

queens <- filter(df, neighbourhood_group == "Queens")
plot_map(queens, zoom = 11, ratio = 0.02)

staten_island <- filter(df, neighbourhood_group == "Staten Island")
plot_map(staten_island, zoom = 11, ratio = 0.05)

brooklyn <- filter(df, neighbourhood_group == "Brooklyn")
plot_map(brooklyn, zoom = 11, ratio = 0.05)

# Heatmap restricted to private rooms.
private_room <- filter(df, room_type == "Private room")
plot_map(private_room, zoom = 11, ratio = 0.03)
# Clustered marker map of the private-room listings, centred on their mean
# coordinate.
center_lng <- mean(private_room[["longitude"]])
center_lat <- mean(private_room[["latitude"]])

# The coordinate columns are named explicitly so leaflet does not have to
# guess them (guessing emits an 'Assuming "longitude" and "latitude" ...'
# message).
private_room %>%
  leaflet() %>%
  addTiles() %>%
  addProviderTiles(providers$OpenStreetMap.DE) %>%
  setView(center_lng, center_lat, 10) %>%
  addMarkers(
    lng = ~longitude,
    lat = ~latitude,
    clusterOptions = markerClusterOptions()
  )
# Heatmap restricted to entire homes/apartments.
entire_home <- filter(df, room_type == "Entire home/apt")
plot_map(entire_home, zoom = 11, ratio = 0.03)
# Interactive leaflet map with one labelled circle marker per row.
#
# @param df Data frame with `longitude`, `latitude` and `name` columns.
# @param zoom Zoom level passed to setView().
# @return A leaflet map widget centred on the mean coordinate of `df`.
plot_marker_map <- function(df, zoom) {
  # Centre on the data actually passed in. The centre was previously taken
  # from the global `most_expensive`, so every other input was centred on
  # the wrong listings.
  center_lng <- mean(df[["longitude"]])
  center_lat <- mean(df[["latitude"]])

  df %>%
    leaflet() %>%
    addTiles() %>%
    addProviderTiles(providers$OpenStreetMap.DE) %>%
    setView(center_lng, center_lat, zoom) %>%
    addCircleMarkers(lng = ~longitude, lat = ~latitude, label = ~name)
}
# The ten highest-priced listings, shown as labelled markers.
most_expensive <- head(arrange(df, desc(price)), n = 10)
plot_marker_map(most_expensive, zoom = 11)

# The ten lowest-priced listings, shown as labelled markers.
least_expensive <- head(arrange(df, price), n = 10)
plot_marker_map(least_expensive, zoom = 11)
# Rows of the file-level `df` that lie on the convex hull of one
# neighbourhood group's listing coordinates.
#
# @param group Value of `neighbourhood_group` to select.
# @return The hull rows, with columns longitude, latitude and
#   neighbourhood_group, in the order chull() reports them.
get_hull <- function(group) {
  group_coords <- select(
    filter(df, neighbourhood_group == group),
    longitude, latitude, neighbourhood_group
  )
  # NOTE(review): chull() is handed the whole three-column data frame; it
  # presumably reads the first two columns as x/y -- confirm.
  hull_rows <- chull(group_coords)
  group_coords[hull_rows, ]
}
# Convex hull of the listings in each neighbourhood group.
manhattan_hull <- get_hull(group = "Manhattan")
queens_hull <- get_hull(group = "Queens")
bronx_hull <- get_hull(group = "Bronx")
staten_island_hull <- get_hull(group = "Staten Island")
brooklyn_hull <- get_hull(group = "Brooklyn")

# Base plot: the five hulls drawn as unfilled black outlines over the
# listing coordinates. Adding a list of layers to a ggplot adds them in
# list order.
base_map <- ggplot(data = df, mapping = aes(x = longitude, y = latitude)) +
  lapply(
    list(
      manhattan_hull,
      queens_hull,
      bronx_hull,
      staten_island_hull,
      brooklyn_hull
    ),
    function(hull) {
      geom_polygon(
        data = hull,
        mapping = aes(x = longitude, y = latitude),
        fill = NA,
        color = "black"
      )
    }
  )
# Listing-density maps: 2D bin counts overlaid with filled density contours.
# The contour level is referenced with after_stat(level); the older
# `..level..` notation is deprecated since ggplot2 3.4.0.

# All listings, drawn over the neighbourhood group outlines.
base_map +
  geom_bin2d(bins = 500) +
  stat_density_2d(
    aes(fill = after_stat(level), alpha = after_stat(level)),
    geom = "polygon"
  ) +
  scale_fill_gradient(low = "green", high = "red") +
  labs(title = "Distribution of Listings") +
  theme_minimal()

# Manhattan only, with its hull outline.
df %>%
  filter(neighbourhood_group == "Manhattan") %>%
  ggplot(mapping = aes(x = longitude, y = latitude)) +
  geom_polygon(
    data = manhattan_hull,
    aes(x = longitude, y = latitude),
    fill = NA,
    color = "black"
  ) +
  geom_bin2d(bins = 600) +
  stat_density_2d(
    aes(fill = after_stat(level), alpha = after_stat(level)),
    geom = "polygon"
  ) +
  scale_fill_gradient(low = "green", high = "red") +
  labs(title = "Listing Density in Manhattan") +
  theme_minimal()

# All listings, with the x axis limited to longitudes -74.1 to -73.8; rows
# outside the limits are removed from the plot.
df %>%
  ggplot(mapping = aes(x = longitude, y = latitude)) +
  geom_bin2d(bins = 200) +
  stat_density_2d(
    aes(fill = after_stat(level), alpha = after_stat(level)),
    geom = "polygon"
  ) +
  lims(x = c(-74.1, -73.8)) +
  scale_fill_gradient(low = "green", high = "red") +
  labs(title = "Distribution of Listings") +
  theme_minimal()
## Warning: Removed 979 rows containing non-finite values (`stat_bin2d()`).
## Warning: Removed 979 rows containing non-finite values (`stat_density2d()`).
## Warning: Removed 20 rows containing missing values (`geom_tile()`).
# 2D bin-count heatmap of the Manhattan listings, coloured from blue (low
# count) to red (high count).
ggplot(
  data = filter(df, neighbourhood_group == "Manhattan"),
  mapping = aes(x = longitude, y = latitude)
) +
  geom_bin2d(bins = 200) +
  scale_fill_gradient(low = "blue", high = "red") +
  ggtitle("Heatmap of Points") +
  theme_minimal()